# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.grid_search import ParameterGrid
from sklearn.model_selection import train_test_split
from itertools import product, chain
from tqdm import tqdm

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

RANDOM_STATE = 0

def get_x(df):
    df['Cabin'].fillna('Unknown', inplace=True)
    df['Embarked'].fillna('Unknown', inplace=True)
    df['Age'].fillna(-1, inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    df['Title'].fillna('na', inplace=True)
    df = df.drop(['Name', 'PassengerId', 'Cabin', 'Embarked'], axis=1)
    
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1
    
    df = df.drop(['Ticket'], axis=1)
    columns = list(df.columns)
    if 'Survived' in columns:
        columns.remove('Survived')
    cat_features = np.where(df[columns].dtypes != np.float)[0]
    return df[columns].values, cat_features


def get_xy(df):
    X, _ = get_x(df)
    y = df['Survived']
    return X, y

#  
def cross_val(X, y, X_test, param, cat_features, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    
    acc = []
    predict = None
    
    for tr_ind, val_ind in skf.split(X, y):
        X_train = X[tr_ind]
        y_train = y[tr_ind]
        
        X_valid = X[val_ind]
        y_valid = y[val_ind]
        
        clf = CatBoostClassifier(iterations=500,
                                loss_function = param['loss_function'],
                                depth=param['depth'],
                                l2_leaf_reg = param['l2_leaf_reg'],
                                eval_metric = 'Accuracy',
                                leaf_estimation_iterations = 10,
                                use_best_model=True,
                                logging_level='Silent'
        )
        
        clf.fit(X_train, 
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid)
        )
        
        y_pred = clf.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        acc.append(accuracy)
    return sum(acc)/n_splits
    
def catboost_GridSearchCV(X, y, X_test, params, cat_features, n_splits=5):
    ps = {'acc':0,
          'param': []
    }
    
    predict=None
    
    for prms in tqdm(list(ParameterGrid(params)), ascii=True, desc='Params Tuning:'):
                          
        acc = cross_val(X, y, X_test, prms, cat_features, n_splits=5)

        if acc>ps['acc']:
            ps['acc'] = acc
            ps['param'] = prms
    print('Acc: '+str(ps['acc']))
    print('Params: '+str(ps['param']))
    
    return ps['param']
    
    
def main():
    train = pd.read_csv("../input/train.csv")
    test = pd.read_csv("../input/test.csv")
    
    X_train, y_train = get_xy(train)
    X_test, cat_features = get_x(test)
    
    params = {'depth':[2, 3, 4],
              'loss_function': ['Logloss', 'CrossEntropy'],
              'l2_leaf_reg':np.logspace(-20, -19, 3)
    }
    
    param = catboost_GridSearchCV(X_train, y_train, X_test, params, cat_features)

    clf = CatBoostClassifier(iterations=2500,
                            loss_function = param['loss_function'],
                            depth=param['depth'],
                            l2_leaf_reg = param['l2_leaf_reg'],
                            eval_metric = 'Accuracy',
                            leaf_estimation_iterations = 10,
                            use_best_model=True
    )
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                        y_train, 
                                                        shuffle=True,
                                                        random_state=RANDOM_STATE,
                                                        train_size=0.8,
                                                        stratify=y_train
    )
    clf.fit(X_train, 
            y_train,
            cat_features=cat_features,
            logging_level='Silent',
            eval_set=(X_valid, y_valid)
    )
    
    sub = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':np.array(clf.predict(X_test)).astype(int)})
    sub.to_csv('cat_sub_1.csv',index=False)
    
if __name__=='__main__':
    main()